The objective of this notebook is to discover the topics of Quora's insincere questions (i.e., rows where target = 1).
import numpy as np, pandas as pd, seaborn as sns, matplotlib.pyplot as plt
import warnings, time, gc
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show, output_notebook, reset_output
from bokeh.palettes import d3
import bokeh.models as bmo
from bokeh.io import save, output_file
import re
import string
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from sklearn.manifold import TSNE
from wordcloud import WordCloud
# Fix the NumPy RNG so any numpy-based sampling below is reproducible.
np.random.seed(32)
color = sns.color_palette("Set2")
# Notebook-wide: silence library FutureWarnings/DeprecationWarnings.
warnings.filterwarnings("ignore")
# English stopwords and ASCII punctuation used by the cleaning step below.
stop_words = set(stopwords.words("english"))
punctuations = string.punctuation
# Render bokeh output inline in the notebook.
output_notebook()
%matplotlib inline
# Load the training data and inspect structure / missing values.
train = pd.read_csv("../input/train.csv")
train.head()
train.isna().sum()

# Class balance: target == 1 marks insincere questions.
target_count = train["target"].value_counts()
plt.figure(figsize = (8, 5))
# Pass x/y as keywords: positional data arguments to barplot were
# deprecated in seaborn 0.11 and removed in 0.12.
ax = sns.barplot(x = target_count.index, y = target_count.values)
# Annotate each bar with its raw count, slightly above the bar top.
for rect, label in zip(ax.patches, target_count.values):
    ax.text(rect.get_x() + rect.get_width() / 2, rect.get_height() + 5,
            label, ha = "center", va = "bottom")
plt.show()
# Question length in words, per class.
train["quest_len"] = train["question_text"].apply(lambda x: len(x.split()))
# .copy() prevents SettingWithCopyWarning when new columns are assigned
# to these subsets later in the notebook.
sincere = train[train["target"] == 0].copy()
insincere = train[train["target"] == 1].copy()
plt.figure(figsize = (15, 8))
# sns.distplot was deprecated in seaborn 0.11 and later removed;
# histplot(kde=True, stat="density") is the modern equivalent
# (histogram normalized to a density, with a KDE overlay).
sns.histplot(sincere["quest_len"], kde = True, stat = "density", label = "sincere")
sns.histplot(insincere["quest_len"], kde = True, stat = "density", label = "insincere")
plt.legend(fontsize = 10)
plt.title("Questions Length Distribution by Class", fontsize = 12)
plt.show()
# Credit: https://www.kaggle.com/jagangupta/stop-the-s-toxic-comments-eda
lem = WordNetLemmatizer()
tokenizer = TweetTokenizer()

def clean_text(question):
    """
    Normalize a raw question string into a cleaned, space-joined word list.

    Steps: lowercase, drop newlines, tokenize, expand contractions via the
    APPO dictionary (defined in a hidden block of code elsewhere in this
    notebook), strip apostrophes, lemmatize as verbs, and drop
    stopwords/punctuation tokens.
    """
    # Convert to lower case, so that Hi and hi are the same.
    question = question.lower()
    # Remove literal newlines.
    question = re.sub("\\n", "", question)
    # BUG FIX: the original stripped apostrophes BEFORE the APPO lookup,
    # turning "you're" into "youre" so the contraction expansion below
    # never matched. Tokenize first, so contractions survive intact.
    words = tokenizer.tokenize(question)
    # Apostrophe/contraction replacement, e.g. "you're" --> "you are"
    # (basic dictionary lookup against the APPO master dictionary).
    words = [APPO[word] if word in APPO else word for word in words]
    # Now drop any remaining (distracting) single quotes from each token.
    words = [re.sub("\'", "", word) for word in words]
    # Lemmatize as verbs, then remove stopwords and punctuation tokens.
    words = [lem.lemmatize(word, "v") for word in words]
    words = [w for w in words if w not in stop_words and w not in punctuations]
    return " ".join(words)
# Clean both subsets; apply() takes the function directly — the lambda
# wrapper in the original added nothing.
sincere["clean_question_text"] = sincere["question_text"].apply(clean_text)
insincere["clean_question_text"] = insincere["question_text"].apply(clean_text)
insincere.head()
# Bag-of-words (uni+bi-gram) counts over the cleaned insincere questions.
cv = CountVectorizer(min_df = 10,
                     max_features = 100000,
                     analyzer = "word",
                     ngram_range = (1, 2),
                     stop_words = "english",
                     # BUG FIX: the original pattern '[a-zA-Z]' matched a
                     # SINGLE letter, so every token became one character.
                     # Match whole alphabetic words instead.
                     token_pattern = r"[a-zA-Z]+")
count_vectors = cv.fit_transform(insincere["clean_question_text"])
# params = {"n_components": [5, 10, 20, 30, 40, 50]}
# lda_model = LatentDirichletAllocation(n_components = n_topics,
# # we choose a small n_components for time convenient
# # will find a appropriate n_components later
# learning_method = "online",
# batch_size = 128,
# evaluate_every = -1,
# max_iter = 20,
# random_state = 32,
# n_jobs = -1)
# model = GridSearchCV(lda_model, param_grid = params)
# model.fit(count_vectors)
# best_lda_model = model.best_estimator_
# best_lda_model
After applying grid search, we found that the optimal n_components lies between 5 and 10. We pick 8, roughly the midpoint of that range.
# Chosen from the grid-search range (5-10) explored above.
n_topics = 8
# Online variational Bayes LDA: mini-batches of 128 documents, no
# intermediate perplexity evaluation (evaluate_every = -1), all cores.
lda_model = LatentDirichletAllocation(n_components = n_topics,
                                      learning_method = "online",
                                      batch_size = 128,
                                      evaluate_every = -1,
                                      max_iter = 20,
                                      random_state = 32,
                                      n_jobs = -1)
# Document-topic matrix: one row per insincere question.
question_topics = lda_model.fit_transform(count_vectors)
# Keep a reference to the raw transform output; question_topics is
# rebound (converted to a matrix) further down.
temp = question_topics
To judge the LDA model's fit, we want the log likelihood to be as high as possible and the perplexity to be as low as possible.
# Fit diagnostics: higher log likelihood / lower perplexity = better model.
print("Log Likelihood: {} \nPerplexity: {}".format(lda_model.score(count_vectors),
                                                   lda_model.perplexity(count_vectors)))
# Project the 8-dimensional topic space down to 2-D for plotting.
tsne_model = TSNE(n_components = 2, verbose = 1, random_state = 32, n_iter = 500)
tsne_lda = tsne_model.fit_transform(question_topics)
# np.matrix is deprecated by NumPy; plain ndarray broadcasting does the
# same row-normalization (LDA rows should already sum to 1 — this is
# defensive).
question_topics = np.asarray(question_topics)
doc_topics = question_topics / question_topics.sum(axis = 1, keepdims = True)
# Dominant topic per question — vectorized argmax replaces the original
# per-row Python loop (whose loop variable `tweet` was never used).
lda_keys = doc_topics.argmax(axis = 1).tolist()
# Assemble the plotting frame: 2-D t-SNE coords + metadata per question.
tsne_lda_df = pd.DataFrame(tsne_lda, columns = ["x", "y"])
tsne_lda_df["qid"] = insincere["qid"].values
tsne_lda_df["question"] = insincere["question_text"].values
tsne_lda_df["topics"] = lda_keys
tsne_lda_df["topics"] = tsne_lda_df["topics"].map(int)
import random
def generate_color():
    """Return a uniformly random color as a '#rrggbb' hex string.

    Draws three bytes from the module-level ``random`` generator, so the
    output is reproducible under ``random.seed``. (The original built the
    triple with ``map(lambda x: ..., range(3))`` where ``x`` was ignored —
    same call sequence, clearer spelled out.)
    """
    r = random.randint(0, 255)
    g = random.randint(0, 255)
    b = random.randint(0, 255)
    return "#{:02x}{:02x}{:02x}".format(r, g, b)
# One random hex color per topic; indexed below by each question's topic id.
colormap = np.array([generate_color() for t in range(n_topics)])
# Interactive bokeh scatter of the t-SNE projection, colored by LDA topic.
# NOTE(review): 'previewsave' and plot_width/plot_height belong to the
# legacy bokeh 0.x/1.x API (renamed in bokeh 3) — confirm installed version.
plot_lda = bp.figure(plot_width = 700, plot_height = 600,
                     title = "LDA topics of Quora Questions",
                     tools = "pan, wheel_zoom, box_zoom, reset, hover, previewsave",
                     x_axis_type = None, y_axis_type = None, min_border = 1)
# Per-point data: coordinates, color, and the fields shown in the tooltip.
source = ColumnDataSource(data = dict(x = tsne_lda_df["x"], y = tsne_lda_df["y"],
                                      color = colormap[lda_keys],
                                      qid = tsne_lda_df["qid"],
                                      question = tsne_lda_df["question"],
                                      topics = tsne_lda_df["topics"]))
plot_lda.scatter(x = "x", y = "y", color = "color", source = source)
# Wire tooltips to the HoverTool created by the 'hover' entry in tools above.
hover = plot_lda.select(dict(type = HoverTool))
hover.tooltips = {"qid": "@qid","question": "@question", "topics": "@topics"}
show(plot_lda)